# This script creates all graphs and figures related to diasporas' association with violence and racism

# Loading prerequisites
rm(list = ls())

library(tidyr)
library(dplyr)
library(data.table)
library(stringr)
library(ggplot2)
library(modelsummary)
library(sandwich)
library(lmtest)
library(fwildclusterboot)
library(fixest)
library(kableExtra)
library(flextable)
library(extrafont)
library(showtext)

setwd("~/Dropbox/Diaspora_Narratives/Submission/SecStud/Replication")

# Setting fonts
font_location <- "/usr/share/fonts/truetype/ebgaramond/EBGaramond12-Regular.ttf"

font_import(pattern = "Garamond", prompt = FALSE)
font_add(family = "EB Garamond 12", regular = font_location)

set_flextable_defaults(font.family = "Times New Roman")


# Defining functions
summarize_model <- function(df, groups = "Government") {
  grouping_vars <- c("Concept1", "Concept2", groups)
  df |>
  group_by_at(grouping_vars) |>
  summarise(nDict1 = sum(nDict1_mean),
            nDict2 = sum(nDict2_mean),
            Sim_sd = sd(Similarity_mean, na.rm = TRUE),
            Sim_se = Sim_sd / sqrt(n()),
            Similarity = mean(Similarity_mean, na.rm = TRUE),
            Sim_se_up = Similarity + Sim_se * qt(p = 0.975, df = n()),
            Sim_se_low = Similarity - Sim_se * qt(p = 0.975, df = n()),
            N = n()) |>
  arrange_at(grouping_vars)
  }

rob_reg <- function(x) {
  coeftest(x, vcov = vcovCL(x, type="HC1", cluster = ~ Dictionary2))
}

conf_int_fun <- function(reg, label, r = 250, core = 2, var = "Government") {
  a1 <- coeftest(reg, vcovCL(reg, type="HC1")) |>
    confint()
  a2 <- coeftest(reg, vcov = vcovCL, type="HC1", cluster = ~ Dictionary2) |>
    confint()
  a3 <- coeftest(reg, vcovBS(reg, cluster = ~Dictionary2, type = "wild", R = r, core = 4)) |>
    confint()

  ci <- rbind(a1[var,],
              a2[var,],
              a3[var,]
              )
  df <- data.frame(ci,
                   coef = reg$coefficients[var],
                   label,
                   clustering = c("Heteroskedastistic",
                                  "Cluster SE (Attribute Dict)",
                                  "Bootstrap (Attribute Dict)"))
  names(df)[1:2] <- c("Pct_2.5", "Pct_97.5")
  return(df)
}

modelsummary_to_tex <- function(x, cap = NULL, totex = TRUE) {
  form <- ifelse(totex, "latex", "html")
  x |> 
    mutate(term = ifelse(statistic == "std.error", "", term)) |> 
    select(-part, -statistic) |> 
    rename(` ` = term) |> 
    kbl(format = form, booktabs = TRUE, caption = cap)
}

# Reading in data

df <- fread("02_data/diaspora_data.csv")

### Graphs

df_temp <- df |>
  summarize_model() |>
  mutate(Concept1 = Concept1 |>
           str_to_title() %>%
           factor(., levels = c("Diaspora", "Black", "White")),
         Concept2 = Concept2 %>%
           str_to_title() %>%
           factor(., levels = c("Racism", "Violence")),
         Government_lab = ifelse(Government == 1, "Government", "Independent"))

ggplot(df_temp, aes(x = Similarity * 100, y = Government_lab)) +
  geom_point(size = 2.5) +
  geom_errorbar(aes(xmax = Sim_se_up * 100, xmin = Sim_se_low * 100), width = .15) +
  facet_grid(Concept1 ~ Concept2) +
  labs(y = "Media Affiliation", x = "Similarity (%)") +
  theme_bw() +
  theme(text = element_text(family = "EB Garamond 12", size = 12))

ggsave("03_output/Graphs/affiliation_attribute_ethnicity_crossection.pdf", height = 6, width = 6)
ggsave("03_output/Graphs/affiliation_attribute_ethnicity_crossection.png", height = 6, width = 6)

df_temp1 <- df %>%
  mutate(Concept1 = Concept1 %>%
           str_to_title() %>%
           factor(., levels = c("Diaspora", "Black", "White")),
         Concept2 = Concept2 %>%
           str_to_title() %>%
           factor(., levels = c("Racism", "Violence")),
         Government_lab = ifelse(Government == 1, "Government", "Independent")) %>%
  filter(Concept1 == "Diaspora") %>%
  summarize_model(groups = c("Eng", "Government_lab")) %>%
  mutate(Similarity = Similarity * 100) %>%
  ungroup()

df_temp_label <- df %>%
  mutate(Concept1 = Concept1 %>%
           str_to_title() %>%
           factor(., levels = c("Diaspora", "Black", "White")),
         Concept2 = Concept2 %>%
           str_to_title() %>%
           factor(., levels = c("Racism", "Violence")),
         Government_lab = ifelse(Government == 1, "Government", "Independent")) %>%
  filter(Concept1 == "Diaspora") %>%
  summarize_model(groups = c("Government_lab")) %>%
  mutate(Similarity = Similarity * 100) %>%
  ungroup()

ordering <- df_temp1$Eng[df_temp1$Similarity[df_temp1$Concept2=="Racism"] %>% order(decreasing = FALSE)] %>% as.character()

df_temp1 <- df_temp1 %>%
  mutate(Eng = factor(Eng, levels = ordering))

ggplot(df_temp1, aes(x = Similarity, y = Eng)) +
  geom_vline(aes(xintercept = Similarity), linetype = "dashed", data = df_temp_label, show.legend = FALSE) +
  geom_point(size = 2.5) +
  geom_errorbar(aes(xmax = Sim_se_up * 100, xmin = Sim_se_low * 100), width = .15) +
  facet_grid(Government_lab ~ Concept2, scales = "free") +
  labs(y = "Media Affiliation", x = "Similarity (%)") +
  theme_bw() +
    theme(text = element_text(family = "EB Garamond 12", size = 12))

ggsave("03_output/Graphs/affiliation_attribute_ethnicity_crossection_ungrouped.pdf", height = 6, width = 8)
ggsave("03_output/Graphs/affiliation_attribute_ethnicity_crossection_ungrouped.png", height = 6, width = 8)

ggplot(df_temp1, aes(x = Similarity, y = Eng)) +
  geom_point(size = 2.5) +
  geom_errorbar(aes(xmax = Sim_se_up * 100, xmin = Sim_se_low * 100), width = .15) +
  facet_grid(Concept1 ~ Concept2) +
  labs(y = "Media Affiliation", x = "Similarity (%)") +
  theme_bw()+
    theme(text = element_text(family = "EB Garamond 12", size = 12))


df_temp2 <- df %>%
  group_by(Concept1, Concept2, Government) %>%
  summarize(Similarity = mean(Similarity_mean, na.rm = TRUE)) %>%
  group_by(Concept1, Concept2) %>%
  summarise(Similarity = Similarity[2] - Similarity[1])

### Regression Tables

star_map <- c('*' = 0.1, '**' = 0.05, '***' = 0.01)

table_note <- list("Notes: $^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01 White robust standard-errors are clustered on the attribute dictionary. Dependent variable is cosine similarity between respective object and attribute dictionaries. Controls for dictionary frequency are shown. Unit of analysis is the object-attribute word pair, which varies according to the object and attribute dictionary size.")
table_note_html <- list("Notes: * p<0.1; ** p<0.05; *** p<0.01 White robust standard-errors are clustered on the attribute dictionary. Dependent variable is cosine similarity between respective object and attribute dictionaries. Controls for dictionary frequency are shown. Unit of analysis is the object-attribute word pair, which varies according to the object and attribute dictionary size.")

#### Individual

df_dict_disp_vl <- df %>%
  filter(Concept1 == "diaspora", Concept2 == "violence")
df_dict_disp_rc <- df %>%
  filter(Concept1 == "diaspora",Concept2 == "racism")

reg_disp_vl_ncnf <- lm(Similarity_mean * 100 ~ Government, data = df_dict_disp_vl) %>%
  rob_reg()
reg_disp_rc_ncnf <- lm(Similarity_mean * 100 ~ Government, data = df_dict_disp_rc) %>%
  rob_reg()
reg_disp_vl_cnf <- lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean), data = df_dict_disp_vl) %>%
  rob_reg()
reg_disp_rc_cnf <- lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean), data = df_dict_disp_rc) %>%
  rob_reg()
reg_disp_vl_ncf <- lm(Similarity_mean * 100 ~ Government + Dictionary1 + Dictionary2, data = df_dict_disp_vl) %>%
  rob_reg()
reg_disp_rc_ncf <- lm(Similarity_mean * 100 ~ Government + Dictionary1 + Dictionary2, data = df_dict_disp_rc) %>%
  rob_reg()
reg_disp_vl_cf <- lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean) + Dictionary1 + Dictionary2, data = df_dict_disp_vl) %>%
  rob_reg()
reg_disp_rc_cf <- lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean) + Dictionary1 + Dictionary2, data = df_dict_disp_rc) %>%
  rob_reg()

dict <- c("Government", "ln Frequency 1", "ln Frequency 2")
names(dict) <- c("Government", "log(nDict1_mean)", "log(nDict2_mean)")


model_list <- list(reg_disp_rc_ncnf,
                   reg_disp_rc_cnf,
                   reg_disp_rc_ncf,
                   reg_disp_rc_cf,
                   reg_disp_vl_ncnf,
                   reg_disp_vl_cnf,
                   reg_disp_vl_ncf,
                   reg_disp_vl_cf)

model_names <- paste0(c(rep("Racism", 4), rep("Violence", 4)), " (", 1:8, ")")
names(model_list) <- model_names

gm <- tribble(~raw, ~clean, ~fmt,
              "r.squared", "R Squared", 2,
              "nobs", "Observations", 0)

rows <- rbind(c('Dictionary FE', rep(c("No", "No", "Yes", "Yes"), times = 2))) %>%
  as_tibble()

colnames(rows) <- c('term', model_names)

tab0 <-
  modelsummary(model_list,
               coef_map = dict,
               coef_omit = c("^Eng|^Year|^Dictionary"),
               gof_map = gm,
               estimate = "{estimate}{stars}",
               add_rows = rows,
               fmt = "%.2f",
               stars = star_map,
               output = 'data.frame',
               escape = FALSE) |>
  modelsummary_to_tex(cap = "Effect of Government Affiliation on Racism and Violence Framing\\label{tab:embedreg}", totex = TRUE) |> 
  kable_styling(latex_options = c("hold_position", "scale_down"), full_width = F) |>
  kableExtra::footnote(general = table_note, escape = FALSE, threeparttable = TRUE, general_title = "") |>
  row_spec(6, hline_after = TRUE) %>%
  pack_rows("Statistics", 7, 7, bold = FALSE, italic = TRUE, label_row_css = "", latex_gap_space = "") %>%
  pack_rows("Fixed effects", 8, 8, bold = FALSE, italic = TRUE, label_row_css = "", latex_gap_space = "") %>%
  add_header_above(c(" " = 1, "Similarity with Diaspora Dictionary (%)" = length(model_list)), underline = FALSE, italic = TRUE, line = FALSE)

save_kable(tab0, file = "03_output/Tables/table_ethnic.tex")


## Export to html for docx

fp_text <- fp_text_default(italic = TRUE)
pars_dv_w <- as_paragraph(as_chunk(c("","Similarity with Diaspora Dictionary (%)"), props = fp_text))
pars_dv_wo <- as_paragraph(as_chunk(c("","Similarity (%)"), props = fp_text))
pars_type <- as_paragraph(as_chunk(c("","Racism","Violence")))

tab0_flex <-
  modelsummary(model_list,
               coef_map = dict,
               coef_omit = c("^Eng|^Year|^Dictionary"),
               gof_map = gm,
               estimate = "{estimate}{stars}",
               add_rows = rows,
               fmt = "%.2f",
               stars = star_map,
               title = "Effect of Government Affiliation on Racism and Violence Framing",
               notes = table_note_html,
               output = 'flextable',
               escape = FALSE) %>%
  hline(6) |>
  add_header_row(values = pars_dv_w,
                 colwidths = c(1, length(model_list)), top = TRUE) |>
  align_text_col(align = "center", header = TRUE, footer = FALSE)

save_as_docx(tab0_flex, path = "03_output/Tables/table_ethnic.docx")

# Across groups

df_dict_disp_vl <- df %>%
  filter(Concept1 == "diaspora", Concept2 == "violence")
df_dict_disp_rc <- df %>%
  filter(Concept1 == "diaspora",Concept2 == "racism")
df_dict_wh_vl <- df %>%
  filter(Concept1 == "white", Concept2 == "violence")
df_dict_wh_rc <- df %>%
  filter(Concept1 == "white",Concept2 == "racism")
df_dict_bl_vl <- df %>%
  filter(Concept1 == "black", Concept2 == "violence")
df_dict_bl_rc <- df %>%
  filter(Concept1 == "black",Concept2 == "racism")

reg_disp_vl_cf <- lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean) + Dictionary1 + Dictionary2, data = df_dict_disp_vl) %>%
  rob_reg()
reg_disp_rc_cf <- lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean) + Dictionary1 + Dictionary2, data = df_dict_disp_rc) %>%
  rob_reg()
reg_wh_vl_cf <- lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean) + Dictionary2, data = df_dict_wh_vl) %>%
  rob_reg()
reg_wh_rc_cf <- lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean) + Dictionary2, data = df_dict_wh_rc) %>%
  rob_reg()
reg_bl_vl_cf <- lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean) + Dictionary1 + Dictionary2, data = df_dict_bl_vl) %>%
  rob_reg()
reg_bl_rc_cf <- lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean) + Dictionary1 + Dictionary2, data = df_dict_bl_rc) %>%
  rob_reg()

dict <- c("Government", "ln Frequency 1", "ln Frequency 2")
names(dict) <- c("Government", "log(nDict1_mean)", "log(nDict2_mean)")
model_list <- list(reg_disp_rc_cf,
                   reg_wh_rc_cf,
                   reg_bl_rc_cf,
                   reg_disp_vl_cf,
                   reg_wh_vl_cf,
                   reg_bl_vl_cf)

model_names <- paste(rep(c("Diaspora", "White", "Black"), times = 2), paste0("(",1:6,")"))
names(model_list) <- model_names

gm <- tribble(~raw, ~clean, ~fmt,
              "r.squared", "R Squared", 2,
              "nobs", "Observations", 0)

rows <- rbind(c('Dictionary FE', rep(c("Yes"), times = 6))) %>%
  as_tibble()

colnames(rows) <- c("term", model_names)

tab_ethnic <-
  modelsummary(model_list,
               coef_map = dict,
               coef_omit = c("^Eng|^Year|^Dictionary"),
               gof_map = gm,
               estimate = "{estimate}{stars}",
               add_rows = rows,
               fmt = "%.2f",
               stars = star_map,
               output = 'data.frame',
               escape = FALSE) %>%
  modelsummary_to_tex(cap = "Effect of Government Affiliation on Racism and Violence Framing over Ethnicity\\label{tab:ethnicityreg}", totex = TRUE) |> 
  kable_styling(latex_options = c("hold_position", "scale_down"), full_width = F) %>%
  kableExtra::footnote(general = table_note, escape = FALSE, threeparttable = TRUE, general_title = "") %>%
  row_spec(6, hline_after = TRUE) %>%
  pack_rows("Statistics", 7, 7, bold = FALSE, italic = TRUE, label_row_css = "", latex_gap_space = "") %>%
  pack_rows("Fixed effects", 8, 8, bold = FALSE, italic = TRUE, label_row_css = "", latex_gap_space = "") %>%
  add_header_above(c(" " = 1, "Racism" = 3, "Violence" = 3)) %>%
  add_header_above(c(" " = 1, "Similarity (%)" = length(model_list)), underline = FALSE, italic = TRUE, line = FALSE)


save_kable(tab_ethnic, file = "03_output/Tables/table_ethnic_group.tex")

tab_ethnic_flex <-
  modelsummary(model_list,
               coef_map = dict,
               coef_omit = c("^Eng|^Year|^Dictionary"),
               gof_map = gm,
               estimate = "{estimate}{stars}",
               add_rows = rows,
               fmt = "%.2f",
               stars = star_map,
               title = "Effect of Government Affiliation on Racism and Violence Framing over Ethnicity",
               notes = table_note_html,
               output = 'flextable',
               escape = FALSE) |>
  hline(6) |>
  add_header_row(values = pars_type, colwidths = c(1,3,3), top = TRUE) |>
  align_text_col(align = "center", header = TRUE, footer = FALSE) |>
  add_header_row(values = pars_dv_wo, colwidths = c(1,6), top = TRUE)

save_as_docx(tab_ethnic_flex, path = "03_output/Tables/table_ethnic_group.docx")


#### Robustness checks

##### Standard errors

reg_rc_lm <-
  df_dict_disp_rc %>%
  lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean) + Dictionary1 + Dictionary2, data = .)

reg_vl_lm <-
  df_dict_disp_vl %>%
  lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean) + Dictionary1 + Dictionary2, data = .)


regressions <- list(reg_rc_lm, reg_vl_lm)
names(regressions) <- c("Race", "Violence")

out <- lapply(1:length(regressions), function(x) conf_int_fun(reg = regressions[[x]], label = names(regressions)[x], var = "Government"))
out_df <- do.call("rbind", out)

ggplot(data = out_df, aes(y = clustering, x = coef, linetype = clustering, color = clustering)) +
  geom_point() +
  geom_errorbar(aes(xmin = Pct_2.5, xmax = Pct_97.5), width=.1) +
  geom_vline(aes(xintercept = 0), linetype = "dashed", color = "red") +
  facet_grid(. ~ label) +
  theme_bw() +
  scale_color_grey() +
  labs(color = "Standard Error", y = "Change in Similarity (%) with Diaspora", x = "", linetype = "Standard Error") +
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
    theme(text = element_text(family = "EB Garamond 12", size = 12))

ggsave("03_output/Graphs/ethnic_se_robustness.pdf", height = 6, width = 9)
ggsave("03_output/Graphs/ethnic_se_robustness.png", height = 6, width = 9)



reg_rc_lm <- lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean) + Dictionary1 + Dictionary2, data = df_dict_disp_rc)

reg_vl_lm <- lm(Similarity_mean * 100 ~ Government + log(nDict1_mean) + log(nDict2_mean) + Dictionary1 + Dictionary2, data = df_dict_disp_vl)

regressions <- list(reg_rc_lm, reg_vl_lm)
names(regressions) <- c("Racism", "Violence")

se_reg_fun <- function(reg, r = 250, core = 2) {
  a1 <- coeftest(reg, vcovCL(reg, type="HC1"))
  a2 <- coeftest(reg, vcovCL(reg, type="HC1", cluster = ~ Dictionary1))
  a3 <- coeftest(reg, vcovBS(reg, cluster = ~ Dictionary1, type = "wild", R = r, core = 4))
  a4 <- coeftest(reg, vcov = vcovCL, type="HC1", cluster = ~ Dictionary2)
  a5 <- coeftest(reg, vcovBS(reg, cluster = ~ Dictionary2, type = "wild", R = r, core = 4))
  out <- list(a1,
              a2,
              a3,
              a4,
              a5)
  return(out)
}

out_reg <- lapply(1:length(regressions), function(x) se_reg_fun(reg = regressions[[x]])) %>%
  unlist(recursive = FALSE)

name_dict <- c("Hetero.",
               "Cluster (Obj)", "Boot (Obj)",
               "Cluster (Attr)", "Boot (Attr)")
model_labels <- paste0(rep(name_dict, 2), " (", 1:(length(name_dict)*2), ")")
names(out_reg) <- model_labels

dict <- c("Government", "ln Frequency 1", "ln Frequency 2")
names(dict) <- c("Government", "log(nDict1_mean)", "log(nDict2_mean)")

model_list <- out_reg
gm <- tribble(~raw, ~clean, ~fmt,
              "r.squared", "R Squared", 2,
              "nobs", "Observations", 0)


rows <- rbind(c('Dictionary FE', rep(c("Yes"), times = length(out_reg)))) %>%
  as_tibble()

names(rows) <- c("term", model_labels)

table_note_se <- list("Notes: $^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01 Regressions labeled with ``Boot\" use Wild Bootstrapping to compute standard errors. Dependent variable is cosine similarity between respective object and attribute dictionaries. Controls for dictionary frequency are shown. Unit of analysis is the object-attribute word pair, which varies according to the object and attribute dictionary size.")
table_note_se_html <- list("Notes: * p<0.1; ** p<0.05; *** p<0.01 Regressions labeled with \"Boot\" use Wild Bootstrapping to compute standard errors. Dependent variable is cosine similarity between respective object and attribute dictionaries. Controls for dictionary frequency are shown. Unit of analysis is the object-attribute word pair, which varies according to the object and attribute dictionary size.")


tab_se <-
  modelsummary(model_list,
               coef_map = dict,
               coef_omit = c("^Eng|^Year|^Dictionary"),
               gof_map = gm,
               estimate = "{estimate}{stars}",
               add_rows = rows,
               fmt = "%.2f",
               stars = star_map,
               notes = list("Regressions labeled with ``Boot\" use Wild Bootstrapping to compute standard errors.", "Notes: * p<0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01"),
               output = 'data.frame',
               escape = FALSE) %>%
  modelsummary_to_tex(cap = "Government Framing of Diaspora with Racism and Violence by Standard Error Type\\label{tab:ethnicse}", totex = TRUE) |>
  kable_styling(latex_options = c("hold_position", "scale_down")) %>%
  kableExtra::footnote(general = table_note_se, escape = FALSE, threeparttable = TRUE, general_title = "") %>%
  row_spec(6, hline_after = TRUE) %>%
  pack_rows("Statistics", 7, 7, bold = FALSE, italic = TRUE, label_row_css = "", latex_gap_space = "") %>%
  pack_rows("Fixed effects", 8, 8, bold = FALSE, italic = TRUE, label_row_css = "", latex_gap_space = "") %>%
  add_header_above(c(" " = 1, "Racism" = 5, "Violence" = 5)) %>%
  add_header_above(c(" " = 1, "Similarity with Diaspora Dictionary (%)" = 10), line = FALSE, italic = TRUE) %>%
  landscape()

save_kable(tab_se, file = "03_output/Tables/ethnic_table_se.tex")

tab_se_flex <-
  modelsummary(model_list,
               coef_map = dict,
               coef_omit = c("^Eng|^Year|^Dictionary"),
               gof_map = gm,
               estimate = "{estimate}{stars}",
               add_rows = rows,
               fmt = "%.2f",
               stars = star_map,
               title = "Government Framing of Diaspora with Racism and Violence by Standard Error Type",
               notes = table_note_se_html,
               output = 'flextable',
               escape = FALSE) |>
  hline(6) |>
  add_header_row(values = pars_type, colwidths = c(1,5,5), top = TRUE) |>
  align_text_col(align = "center", header = TRUE, footer = FALSE) |>
  add_header_row(values = pars_dv_w, colwidths = c(1,10), top = TRUE)

save_as_docx(tab_se_flex, path = "03_output/Tables/ethnic_table_se.docx")
